R Laboratory – Exercise 4

Guglielmo Bordin

14 May 2023

Insurance claims

The number of claims received by an insurance company during a week follows a Poisson distribution with unknown mean \(\mu\). The claims per week observed over a ten week period are:

5, 8, 4, 6, 11, 6, 6, 5, 6, 4

We’ll start by computing the posterior distribution with the assumption of a uniform prior for \(\mu\). Let’s first define a helpful bayes function to calculate everything we need in the analysis.

bayes <- function(lhood, prior, max, cred_int = c(0.025, 0.975)) {
    # Grid-based Bayesian update on [0, max].
    #
    # lhood, prior: vectorized functions of the parameter.
    # max: upper bound of the parameter grid.
    # cred_int: CDF levels defining the credibility interval.
    #
    # Returns a list with the normalized posterior on the grid (`post`)
    # and its mode, median (`med`), mean, standard deviation (`std`)
    # and credibility interval (`cred`).
    dx <- 0.001
    x <- seq(0, max, by = dx)

    # Evaluate the unnormalized posterior once, then normalize with the
    # same Riemann sum used for all the moments below. This guarantees
    # the discrete CDF ends at exactly 1, so every quantile lookup via
    # `which.max(cdf > p)` is well defined (with integrate() the CDF
    # could fall short of the requested level and which.max would
    # silently return the first grid point).
    unnorm <- lhood(x) * prior(x)
    post <- unnorm / (sum(unnorm) * dx)
    cdf <- cumsum(post) * dx

    # first grid point where the CDF exceeds level p
    quantile_at <- function(p) x[which.max(cdf > p)]

    # posterior mode, median, mean and std (avoid shadowing base
    # `mean`/`median` with the local names)
    post_mode <- x[which.max(post)]
    post_med <- quantile_at(0.5)
    post_mean <- sum(x * post) * dx
    post_std <- sqrt(sum((x - post_mean)^2 * post) * dx)

    # credibility interval (vapply pins the return type)
    cred <- vapply(cred_int, quantile_at, numeric(1))

    # return named list (same names as before)
    list(
        post = post, mode = post_mode, med = post_med,
        mean = post_mean, std = post_std, cred = cred
    )
}

Now, let’s start with the uniform prior.

# weekly claim counts observed over the ten-week period
claims <- c(5, 8, 4, 6, 11, 6, 6, 5, 6, 4)
n <- length(claims)

# poissonian likelihood, up to a constant factor:
# prod_k mu^y_k exp(-mu) is proportional to mu^(sum y_k) exp(-n mu)
lhood <- \(m) m^sum(claims) * exp(-n * m)

# grid upper bound, comfortably past the largest observation
max_mu <- 1.5 * max(claims)
# grid spacing; must match the dx used inside bayes() because the
# plotting code below recomputes CDFs as cumsum(f) * dmu
dmu <- 0.001
# g(mu) = 1 for all mu --> improper on the half-line, but harmless
# here: the posterior is normalized on the truncated grid [0, max_mu]
post_unif <- bayes(lhood, \(m) 1, max = max_mu)
Posterior parameters
Mode Median Mean St. dev. 95% cred. int.
6.1 6.167 6.2 0.7874008 4.754, 7.836

Now, we’ll use a Jeffrey’s prior instead, that is

\[ g(\mu) = \frac{1}{\sqrt{\mu}}. \]

This is another improper prior, but it’s not a problem, since we normalize the posterior numerically inside the bayes function.

To handle the value at 0 correctly, we also set \(g(\mu) = 0\) for \(\mu = 0\), to avoid a divergence in the posterior.

# Jeffreys-type prior 1/sqrt(mu), with g(0) forced to 0 so the
# posterior stays finite at the left edge of the grid
post_jeff <- bayes(lhood, \(m) ifelse(m > 0, 1 / sqrt(m), 0), max_mu)
Posterior parameters
Mode Median Mean St. dev. 95% cred. int.
6.05 6.117 6.15 0.7842194 4.71, 7.779

Let’s plot the two posteriors with their credibility intervals.

# palette and theme shared by all the plots below
my_pal <- wesanderson::wes_palette("Zissou1", 5)[c(1, 3, 5)]
theme_set(theme_minimal(base_size = 14, base_family = "Open Sans"))

# legend labels and colours keyed by the short prior names
prior_names <- c(unif = "Uniform", jeff = "Jeffrey’s")
prior_cols <- my_pal[1:2] |> set_names("unif", "jeff")
tibble(
    # rebuild the mu grid with the same length as the posterior vector
    mu = seq(0, max_mu, along.with = post_unif$post),
    unif = post_unif$post,
    jeff = post_jeff$post,
) |>
    # transform to long format for ggplot
    pivot_longer(-1, names_to = "dist", values_to = "f") |>
    # reorder `dist` factors to follow the order in the text
    mutate(dist = fct_relevel(dist, "unif", "jeff")) |>
    ggplot(aes(x = mu, y = f, colour = dist)) +
        geom_line(linewidth = 0.8) +
        # shade 95% credibility intervals: keep only the grid points
        # between the 2.5% and 97.5% quantiles, recomputed here from
        # the discrete CDF (assumes grid spacing equals global `dmu`)
        geom_area(
            aes(fill = dist),
            data = function(x) {
                x |> group_by(dist) |>
                     filter(mu > mu[which.max(cumsum(f) * dmu > 0.025)] &
                            mu < mu[which.max(cumsum(f) * dmu > 0.975)])
            },
            position = "identity",
            alpha = 0.5, show.legend = FALSE
        ) +
        scale_colour_manual(values = prior_cols, labels = prior_names) +
        scale_fill_manual(values = prior_cols) +
        labs(
            x = "Poisson parameter μ",
            y = "Posterior distribution P(μ | y)",
            colour = "Prior"
        ) +
        coord_cartesian(expand = FALSE, ylim = c(0, 0.55))

Finally, let’s approximate the two posteriors with a normal distribution with the same mean and standard deviation, and compare the credibility intervals.

# CDF levels for the 95% credibility interval
thr <- c(0.025, 0.975)

# Compare each posterior's exact credibility interval with the one
# obtained from a normal approximation sharing its mean and std.
tibble(
    prior = c("Uniform", "Jeffrey’s"),
    mean = c(post_unif$mean, post_jeff$mean),
    std = c(post_unif$std, post_jeff$std),
    cred = c(
        paste(post_unif$cred, collapse = ", "),
        paste(post_jeff$cred, collapse = ", ")
    ),
    # normal-approximation interval from the Gaussian quantile function
    cred_n = c(
        paste(
            format(qnorm(thr, post_unif$mean, post_unif$std), digits = 4),
            collapse = ", "
        ),
        paste(
            format(qnorm(thr, post_jeff$mean, post_jeff$std), digits = 4),
            collapse = ", "
        )
    )
) |>
    # `mutate_if` is superseded; `across(where(...))` is the current
    # dplyr idiom for column-conditional transformations
    mutate(across(where(is.numeric), \(v) format(v, digits = 4))) |>
    set_names(
        "Prior", "Mean", "St. dev.",
        "95% cred. int.", "95% cred. int. (normal approx.)"
    ) |>
    kableExtra::kbl(caption = "Posterior parameters") |>
    kableExtra::kable_styling(full_width = FALSE)
Posterior parameters
Prior Mean St. dev. 95% cred. int. 95% cred. int. (normal approx.)
Uniform 6.20 0.7874 4.754, 7.836 4.657, 7.743
Jeffrey’s 6.15 0.7842 4.71, 7.779 4.613, 7.687

The normal approximation works well enough in both cases; after all, with a total of 61 observed counts across the ten weeks, the posterior is expected to be nearly Gaussian.

# mu grid matching the posterior vectors, plus colours for the
# original-vs-approximation comparison
m <- seq(0, max_mu, along.with = post_unif$post)
type_cols <- my_pal[c(1, 3)]

tibble(
    mu = m,
    unif = post_unif$post,
    # normal approximations share each posterior's mean and std
    unif_n = dnorm(m, post_unif$mean, post_unif$std),
    jeff = post_jeff$post,
    jeff_n = dnorm(m, post_jeff$mean, post_jeff$std)
) |>
    pivot_longer(-mu, names_to = "dist", values_to = "f") |>
    # encode the original/approx. distinction in separate column
    # (columns suffixed `_n` hold the normal approximations)
    mutate(
        type = ifelse(str_ends(dist, "_n"), "appr", "orig"),
        dist = str_remove(dist, "_n")
    ) |>
    # reorder factors as needed
    mutate(
        dist = fct_relevel(dist, "unif", "jeff"),
        type = fct_relevel(type, "orig", "appr")
    ) |>
    ggplot(aes(x = mu, y = f, colour = type)) +
        geom_line(linewidth = 0.8) +
        # shade each curve's 95% credibility interval, recomputed from
        # its discrete CDF (grid spacing assumed equal to global `dmu`)
        geom_area(
            aes(fill = type),
            data = function(x) {
                x |> group_by(dist, type) |>
                     filter(mu > mu[which.max(cumsum(f) * dmu > 0.025)] &
                            mu < mu[which.max(cumsum(f) * dmu > 0.975)])
            },
            position = "identity",
            alpha = 0.5, show.legend = FALSE
        ) +
        scale_colour_manual(
            values = type_cols, labels = c("Original", "Normal appr.")
        ) +
        scale_fill_manual(values = type_cols) +
        labs(
            x = "Poisson parameter μ",
            y = "Posterior distribution P(μ | y)",
            colour = "Type"
        ) +
        coord_cartesian(expand = FALSE, ylim = c(0, 0.55)) +
        # one panel per prior, approximation overlaid in each
        facet_grid(
            rows = vars(dist),
            labeller = as_labeller(
                c(unif = "Uniform prior", jeff = "Jeffrey’s prior")
            )
        )

Blood tests

A well established method for detecting a disease in blood fails to detect it in 15% of the patients that actually have the disease.

A young UniPd start-up has developed an innovative method of screening. During the qualification phase, a random sample of \(n = 75\) patients known to have the disease is screened using the new method.

The probability distribution of \(y\), the number of times the new method fails to detect the disease, is a binomial with \(n = 75\). The method fails to detect the disease in 6 cases. Following a frequentist approach, we would say that an unbiased estimator for the probability of our binomial distribution is given by

\[ \hat{p}_\mathrm{f} = \frac{y}{n} = \frac{6}{75} = 0.08, \]

with standard deviation

\[ \sigma = \sqrt{\operatorname{Var}[\hat{p}_\mathrm{f}]} = \sqrt{\frac{y}{n^2}\biggl(1 - \frac{y}{n}\biggr)} = 0.031. \]

Let’s proceed now with a bayesian computation, assuming a beta prior with mean value \(0.15\) and standard deviation \(0.14\). We know from the theory that the posterior distribution of a binomial process with a beta prior is given by another beta distribution, with parameters

\[ \alpha = \alpha_\mathrm{p} + y, \quad \beta = \beta_\mathrm{p} + n - y, \]

with \(\alpha_\mathrm{p}\), \(\beta_\mathrm{p}\) the parameters of the prior. However, we’ll still use the bayes function for the moment.

# observed failures and sample size for the new screening method
y <- 6
n <- 75

# requested mean and standard deviation of the beta prior
mu <- 0.15
sig <- 0.14

# formulas to get alpha and beta from mean and sigma
(alpha <- mu * (mu * (1 - mu) / sig^2 - 1))
## [1] 0.8257653
(beta <- (1 - mu) * alpha / mu)
## [1] 4.679337
post_beta <- bayes(
    # binomial likelihood, up to the constant choose(n, y) factor
    lhood = \(p) p**y * (1 - p)**(n - y),
    # handle the 0 value correctly (dbeta diverges at 0 when alpha < 1)
    prior = \(p) ifelse(p > 0, dbeta(p, alpha, beta), 0),
    max = 1
)
Posterior parameters
Mode Median Mean St. dev. 95% cred. int.
0.074 0.081 0.0847867 0.0308555 0.035, 0.154

We can see that the peak of the posterior is shifted a bit to the left of the frequentist expected value \(0.08\). This is due to the prior’s influence: let’s plot the posterior together with the prior and see what is happening.

# probability grid matching the posterior vector
p <- seq(0, 1, along.with = post_beta$post)
tibble(p, fp = post_beta$post) |>
    ggplot(aes(x = p, y = fp)) +
        # prior curve (dotted), drawn from its own data frame
        geom_line(
            aes(x = p, y = fp, linetype = "prior"),
            data = tibble(p, fp = dbeta(.data$p, alpha, beta)),
            linewidth = 0.8, colour = my_pal[1]
        ) +
        # posterior curve (solid)
        geom_line(
            aes(linetype = "post"),
            linewidth = 0.8, colour = my_pal[1]
        ) +
        # standard deviation shading: grid points within one std of
        # the posterior mean
        geom_area(
            data = function(x) {
                m <- post_beta$mean
                s <- post_beta$std
                filter(x, p > m - s & p < m + s)
            },
            colour = my_pal[1], fill = my_pal[1],
            alpha = 0.5, show.legend = FALSE
        ) +
        # mean line
        geom_vline(xintercept = post_beta$mean, colour = my_pal[1]) +
        annotate(
            "label", x = 0.1, y = 13.5, hjust = 0,
            label = paste(
                "Mean ± Std =",
                format(post_beta$mean, digits = 2), "±",
                format(post_beta$std, digits = 2)
            )
        ) +
        # map the two curves to solid/dotted with readable legend labels
        scale_linetype_manual(
            values = c(post = "solid", prior = "dotted"),
            labels = c(post = "Posterior", prior = "Prior")
        ) +
        labs(
            x = "Binomial probability",
            y = element_blank(),
            linetype = "Distribution"
        ) +
        coord_cartesian(expand = FALSE, ylim = c(0, 15))

The selected prior places high values near \(\pi = 0\), causing the peak of the likelihood (which is precisely 0.08) to shift slightly towards the left. At the same time, the slow decay of the prior towards the right is making the posterior distribution more asymmetric, leading to a shift of the mean value towards the right, 0.085.

Now, let’s conduct a hypothesis test. We assume that if the probability of failing to detect the disease in sick patients is greater than or equal to 15%, the new test is no better than the traditional method. We’ll test the sample at a 5% level of significance, continuing with the Bayesian approach.

We can formulate our null hypothesis as follows: the binomial parameter of the posterior distribution for the new method is \(p \geq p_0\), where \(p_0 = 0.15\) is the probability of failure associated with the traditional test. Of course, the alternative hypothesis is then \(p < p_0\), i.e. the new test is better than the other.

In the Bayesian framework, we need to calculate the probability of observing the data under the null hypothesis by integrating the posterior probability over the region where the null hypothesis is true. If this probability is smaller than the chosen level of significance \(\alpha = 0.05\), we reject the null hypothesis and accept the alternative one.

To take advantage of R’s integrate function, we’ll redefine the posterior as a function dbeta with the posterior parameters that we previously discussed.

# Closed-form posterior: a beta density with the updated parameters
# alpha + y and beta + n - y.
post <- function(x) {
    dbeta(x, alpha + y, beta + n - y)
}
# failure probability of the traditional test (null threshold)
p0 <- 0.15

# posterior probability of the null region p >= p0
integrate(post, p0, 1)
## 0.03127933 with absolute error < 8.2e-07

We can see from the analysis that the posterior probability of the null hypothesis (i.e., that the new test is no better than the old one) is smaller than the chosen significance level of 0.05. Therefore, we can accept the alternative hypothesis and say that the new test is better than the old one at a 5% level of significance.

Let’s highlight the integration region and the level of significance on the posterior plot.

# significance level from quantile function (95th percentile of the
# posterior, i.e. the point with 5% of the mass to its right)
# NOTE(review): this name masks base::signif in the global environment
signif <- qbeta(1 - 0.05, alpha + y, beta + n - y)
# posterior probability of the null region p >= p0
prob_h0 <- integrate(post, p0, 1)$value

tibble(p, post = post(p)) |>
    ggplot(aes(x = p, y = post)) +
        geom_line(linewidth = 0.8, colour = my_pal[1]) +
        # shading the area over p0 (the H0 integration region)
        geom_area(
            data = \(x) x |> filter(p > p0),
            colour = my_pal[1], fill = my_pal[1],
            alpha = 0.5, show.legend = FALSE
        ) +
        # label the probability of H0
        annotate(
            "label", x = 0.16, y = 1.5,
            label = paste("P(H₀ | y) =", format(prob_h0, digits = 3)),
            hjust = 0, colour = my_pal[1]
        ) +
        # annotate the significance level
        geom_segment(
            aes(x = signif, y = 0, xend = signif, yend = post(signif)),
            colour = my_pal[3]
        ) +
        annotate(
            "label", x = signif - 0.005, y = 1, label = "α = 0.05",
            hjust = 1, colour = my_pal[3]
        ) +
        labs(
            x = "Binomial probability p",
            y = "Posterior distribution P(p | y)"
        ) +
        coord_cartesian(expand = FALSE, xlim = c(0, 0.5), ylim = c(0, 15))

Now, let’s redo the same hypothesis test in the classical frequentist way. We have to setup a null distribution: in this case it’s the binomial distribution with the probability parameter set to \(p_0 = 0.15\), the failure rate of the traditional test.

Next, we need to determine the probability of obtaining 6 or fewer incorrect screenings out of a total of 75, under the assumption that the null hypothesis is true, i.e. that the new test follows a binomial distribution with parameters \(n=75\) and \(p=0.15\).

# Frequentist p-value: probability of 6 or fewer failed detections
# under the null binomial model Binom(n = 75, p = 0.15).
pbinom(q = 6, size = 75, prob = 0.15) |> cat()
## 0.0543533

This is bigger than the chosen significance level of 5%! So, a frequentist would fail to reject the null hypothesis and say that we cannot claim with 95% confidence that the new test is better than the old one.

# Bar chart of the null CDF P(Y <= y | n = 75, p = 0.15), with the
# observed count (6) highlighted and the 5% threshold drawn.
tibble(
    fails = 0:12,
    prob = pbinom(fails, 75, 0.15),
    # flag the observed outcome of the new test for colouring
    test = ifelse(fails == 6, "new", "old")
) |>
    ggplot(aes(x = fails, y = prob)) +
        geom_col(aes(fill = test)) +
        scale_fill_manual(
            values = c(old = my_pal[1], new = my_pal[2]),
            guide = "none"
        ) +
        # 5% significance line: bars ending below it reject H0
        geom_hline(yintercept = 0.05, colour = my_pal[3]) +
        annotate(
            "text", x = 2, y = 0.05, vjust = -0.7, label = "Accept H₀"
        ) +
        annotate(
            "text", x = 2, y = 0.05, vjust = 1.7, label = "Reject H₀"
        ) +
        annotate(
            "label", x = 0, y = 0.05, hjust = 0,
            label = "α = 0.05", colour = my_pal[3]
        ) +
        labs(x = "Failed detections y", y = "CDF(y | n = 75, p = 0.15)") +
        scale_x_continuous(breaks = seq(0, 12, by = 3)) +
        coord_cartesian(expand = FALSE, ylim = c(0, 0.7))

The discrepancy with the Bayesian result can be attributed to the influence of the prior in the Bayesian analysis. The chosen prior shifts the posterior distribution slightly to the left, pushing the posterior probability of \(p \geq p_0\) just below the 5% threshold.

The Lighthouse problem

A lighthouse is situated at a position \(\alpha\) along the shore and a distance \(\beta\) out to sea. The lighthouse emits a series of short, highly collimated flashes at random intervals and random angles. On the coast, we have placed photo-detectors that record only the position \(x_k\) of each flash’s arrival, but not the angle of emission.

We have recorded \(N\) flashes at positions \({x_k}\), and our goal is to use this information to estimate the lighthouse’s position.

We assign a uniform likelihood PDF to the azimuth angle \(\theta_k\),

\[ P(\theta_k \;\vert\; \alpha, \beta) = \frac{1}{\pi}, \]

where \(\theta_k\) is connected to \(\alpha\) and \(\beta\) by the relation

\[ x_k - \alpha = \beta \tan\theta_k. \]

If we then do a change of variables on the PDF we get

\[ P(x \;\vert\; \alpha, \beta) = \frac{\beta}{\pi[\beta^2 + (x - \alpha)^2]}, \]

which is a Cauchy distribution.

Suppose that we do not know both the position along the shore \(\alpha\) and the distance out to sea \(\beta\). In order to estimate the posterior probability of these two parameters given the observed flash positions, we will perform a Bayesian analysis.

To simulate this scenario, we set the true values of \(\alpha\) and \(\beta\) to be 1 km and 5 km, respectively. We sample the positions by generating them from a uniform distribution on the angles \(\theta_k\).

# number of recorded flashes and true lighthouse position
nflash <- 300
a_true <- 1
b_true <- 5

# sample arrival positions from uniform emission angles; tan() has
# period pi, so drawing on (-pi, pi) yields the same Cauchy position
# distribution as the half-circle (-pi/2, pi/2)
# NOTE(review): no set.seed() here, although the text later mentions a
# chosen "lucky seed" -- confirm where the seed is set, otherwise the
# reported results are not reproducible
data <- a_true + b_true * tan(runif(nflash, -pi, pi))

Then, we define the grids for the \(\alpha\) and \(\beta\) parameters, and assign a uniform prior to both. We assume \(\alpha\) to be anywhere between -2 km and 2 km and \(\beta\) anywhere between 1 km and 9 km.

# grid resolution and search ranges (km) for the two parameters:
# alpha in [-2, 2] along the shore, beta in [1, 9] out to sea
ngrid <- 500
min_a <- -2
max_a <- 2
min_b <- 1
max_b <- 9
alpha <- seq(min_a, max_a, length.out = ngrid)
beta <- seq(min_b, max_b, length.out = ngrid)

Now we come to the likelihood calculation. Since we’re taking products to get the overall likelihood of the data, it’s more convenient to take the logarithm first and later exponentiate the log-posterior.

# Log-likelihood on the (alpha, beta) grid. outer() calls the inner
# function with the full grid of (a, b) pairs as parallel vectors;
# Reduce() then folds over the data points, adding each flash's log
# Cauchy density log(b) - log(b^2 + (x_k - a)^2) to the running
# vector total (starting from init = 0).
loglhood <- outer(
    alpha, beta,
    \(a, b) Reduce(
        \(x, y) x + log(b) - log(b^2 + (y - a)^2), data, init = 0
    )
)

Now we can proceed by exponentiating the log-likelihood and normalize the result to get the posterior.

# 2d step size (area of a single grid cell)
dab <- (max_a - min_a) * (max_b - min_b) / ngrid^2
# subtract the maximum before exponentiating to avoid underflow;
# the constant shift cancels out after normalization
lhood <- exp(loglhood - max(loglhood))
# normalize so the posterior sums to 1/dab over the grid
post <- lhood / (sum(lhood) * dab)

Let’s compute the posterior moments and a 95% credibility interval, both for \(\alpha\) and for \(\beta\).

# Posterior summaries for alpha and beta from the joint grid `post`
# (rows index alpha, columns index beta; dab is the 2-d cell area).

# modes: grid coordinates of the joint maximum.
# which(..., arr.ind = TRUE) returns one row per match; take the first
# row explicitly so a tie cannot mix up row and column indices (the
# old max_idx[1]/max_idx[2] read down the first column in that case)
max_idx <- which(post == max(post), arr.ind = TRUE)[1, ]
mode_a <- alpha[max_idx[1]]
mode_b <- beta[max_idx[2]]

# means of the marginal distributions (recycling alpha down the rows,
# and beta down the rows of the transpose)
mean_a <- sum(alpha * post) * dab
mean_b <- sum(beta * t(post)) * dab

# standard deviations: take the square root of the variance
# (the previous version stored the variance under the std name)
std_a <- sqrt(sum((alpha - mean_a)^2 * post) * dab)
std_b <- sqrt(sum((beta - mean_b)^2 * t(post)) * dab)

# marginal cdfs, integrating out the other parameter
cdf_a <- cumsum(rowSums(post)) * dab
cdf_b <- cumsum(colSums(post)) * dab

# 95% credibility intervals from the marginal cdfs
thr <- c(0.025, 0.975)
cred_a <- vapply(thr, \(p) alpha[which.max(cdf_a > p)], numeric(1))
cred_b <- vapply(thr, \(p) beta[which.max(cdf_b > p)], numeric(1))
Posterior parameters
Variable Mode Mean St. dev. 95% cred. int.
α 1.04 1.03 0.15 0.26, 1.78
β 4.86 4.94 0.17 4.19, 5.78

The results are quite good, knowing the true values of \(\alpha\) and \(\beta\). We have to point out though that we’ve chosen a pretty lucky seed; in most runs with nflash = 300 the true values easily fell outside the credibility intervals. Of course, the situation gets better as we increase nflash.

We’ll make a contour plot with ggplot, so we’ll need to convert the data to a long-format tibble first.

# Contour plot of the joint posterior: convert the matrix to a long
# tibble with explicit (alpha, beta) coordinates first.
as.data.frame(post) |>
    # add row index column
    rowid_to_column("row") |>
    pivot_longer(-row, names_to = "col", values_to = "post") |>
    # add alpha and beta from row/col index; as.data.frame named the
    # columns V1..Vn, so strip the "V" to recover the column number
    mutate(a = alpha[row], b = beta[as.numeric(gsub("V", "", col))]) |>
    select(a, b, post) |>
    # plotting
    ggplot(aes(x = a, y = b, z = post)) +
        geom_contour(aes(colour = after_stat(level)), linewidth = 0.8) +
        scale_colour_viridis_c() +
        labs(
            x = "Position along the shore α",
            y = "Distance out to sea β",
            colour = "Posterior\nprobability"
        ) +
        scale_y_continuous(breaks = seq(min_b, max_b, length.out = 5)) +
        coord_cartesian(
            expand = FALSE, xlim = c(min_a, max_a), ylim = c(min_b, max_b)
        )